{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn import datasets\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "import torch\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.utils.data import TensorDataset\n",
    "\n",
    "\n",
    "class KddData(object):\n",
    "\n",
    "    def __init__(self, batch_size):\n",
    "        kddcup99 = datasets.fetch_kddcup99()\n",
    "        self._encoder = {\n",
    "            'protocal': LabelEncoder(),\n",
    "            'service':  LabelEncoder(),\n",
    "            'flag':     LabelEncoder(),\n",
    "            'label':    LabelEncoder()\n",
    "        }\n",
    "        self.batch_size = batch_size\n",
    "        data_X, data_y = self.__encode_data(kddcup99.data, kddcup99.target)\n",
    "        self.train_dataset, self.test_dataset = self.__split_data_to_tensor(data_X, data_y)\n",
    "        self.train_dataloader = DataLoader(self.train_dataset, self.batch_size, shuffle=True)\n",
    "        self.test_dataloader = DataLoader(self.test_dataset, self.batch_size, shuffle=True)\n",
    "\n",
    "\n",
    "    \"\"\"将数据中字符串部分转换为数字，并将输入的41维特征转换为8*8的矩阵\"\"\"\n",
    "    def __encode_data(self, data_X, data_y):\n",
    "        self._encoder['protocal'].fit(list(set(data_X[:, 1])))\n",
    "        self._encoder['service'].fit(list(set(data_X[:, 2])))\n",
    "        self._encoder['flag'].fit((list(set(data_X[:, 3]))))\n",
    "        self._encoder['label'].fit(list(set(data_y)))\n",
    "        data_X[:, 1] = self._encoder['protocal'].transform(data_X[:, 1])\n",
    "        data_X[:, 2] = self._encoder['service'].transform(data_X[:, 2])\n",
    "        data_X[:, 3] = self._encoder['flag'].transform(data_X[:, 3])\n",
    "        data_X = np.pad(data_X, ((0, 0), (0, 64 - len(data_X[0]))), 'constant').reshape(-1, 1, 8, 8)\n",
    "        data_y = self._encoder['label'].transform(data_y)\n",
    "        return data_X, data_y\n",
    "\n",
    "    \"\"\"将数据拆分为训练集和测试集，并转换为TensorDataset对象\"\"\"\n",
    "    def __split_data_to_tensor(self, data_X, data_y):\n",
    "        X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.3)\n",
    "        train_dataset = TensorDataset(\n",
    "            torch.from_numpy(X_train.astype(np.float32)),\n",
    "            torch.from_numpy(y_train.astype(np.int))\n",
    "        )\n",
    "        test_dataset = TensorDataset(\n",
    "            torch.from_numpy(X_test.astype(np.float32)),\n",
    "            torch.from_numpy(y_test.astype(np.int))\n",
    "        )\n",
    "        return train_dataset, test_dataset\n",
    "\n",
    "    \"\"\"接受一个数组进行解码\"\"\"\n",
    "    def decode(self, data, label=False):\n",
    "        if not label:\n",
    "            _data = list(data)\n",
    "            _data[1] = self._encoder['protocal'].inverse_transform([_data[1]])[0]\n",
    "            _data[2] = self._encoder['service'].inverse_transform([_data[2]])[0]\n",
    "            _data[2] = self._encoder['flag'].inverse_transform([_data[3]])[0]\n",
    "            return _data\n",
    "        return self._encoder['label'].inverse_transform(data)\n",
    "    \n",
    "    def encode(self, data, label=False):\n",
    "        if not label:\n",
    "            _data = list(data)\n",
    "            _data[1] = self._encoder['protocal'].transform([_data[1]])[0]\n",
    "            _data[2] = self._encoder['service'].transform([_data[2]])[0]\n",
    "            _data[3] = self._encoder['flag'].transform([_data[3]])[0]\n",
    "            return _data\n",
    "        return self._encoder['label'].transform([data])[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn as nn\n",
    "\n",
    "class CNN(nn.Module):\n",
    "    def __init__(self, in_dim, n_class):\n",
    "        super(CNN, self).__init__()\n",
    "\n",
    "        self.conv = nn.Sequential(\n",
    "            nn.Conv2d(in_dim, 6, 3, stride=1, padding=1),\n",
    "            nn.BatchNorm2d(6),\n",
    "            nn.ReLU(True),\n",
    "            nn.Conv2d(6, 16, 3, stride=1, padding=0),\n",
    "            nn.BatchNorm2d(16),\n",
    "            nn.ReLU(True),\n",
    "            nn.MaxPool2d(2, 2)\n",
    "        )\n",
    "\n",
    "        self.fc = nn.Sequential(\n",
    "            nn.Linear(144, 512),\n",
    "            nn.Linear(512, 256),\n",
    "            nn.Linear(256, n_class)\n",
    "        )\n",
    "\n",
    "    def forward(self, x):\n",
    "        out = self.conv(x)\n",
    "        out = out.view(out.size(0), -1)\n",
    "        out = self.fc(out)\n",
    "        return out\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "from torch.autograd import Variable\n",
    "# Neural-network hyperparameters\n",
    "batch_size = 128\n",
    "learning_rate = 1e-2\n",
    "num_epoches = 20\n",
    "USE_GPU = torch.cuda.is_available()\n",
    "\n",
    "\n",
    "dataset = KddData(batch_size)\n",
    "model = CNN(1, 23)\n",
    "\n",
    "def train():\n",
    "    \"\"\"Train the global `model` on `dataset` and print per-epoch metrics.\n",
    "\n",
    "    Uses the module-level settings `learning_rate`, `num_epoches` and\n",
    "    `USE_GPU`. When `USE_GPU` is true the global `model` is rebound to its\n",
    "    CUDA copy. After every training epoch the model is evaluated on the test\n",
    "    loader; average loss and accuracy are printed for both passes.\n",
    "    \"\"\"\n",
    "    global model\n",
    "\n",
    "    if USE_GPU:\n",
    "        model = model.cuda()\n",
    "\n",
    "    criterion = nn.CrossEntropyLoss()\n",
    "    optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
    "    n_train = len(dataset.train_dataset)\n",
    "    n_test = len(dataset.test_dataset)\n",
    "\n",
    "    for epoch in range(num_epoches):\n",
    "        print('epoch {}'.format(epoch + 1))\n",
    "        print('*' * 10)\n",
    "        # The evaluation pass below switches to eval mode, so training mode\n",
    "        # must be restored every epoch; otherwise BatchNorm would stop using\n",
    "        # batch statistics from the second epoch on.\n",
    "        model.train()\n",
    "        running_loss = 0.0\n",
    "        running_acc = 0.0\n",
    "        for img, label in dataset.train_dataloader:\n",
    "            if USE_GPU:\n",
    "                img = img.cuda()\n",
    "                label = label.cuda()\n",
    "            # Forward pass\n",
    "            out = model(img)\n",
    "            loss = criterion(out, label)\n",
    "            # The loss is a batch mean; weight it by batch size so the epoch\n",
    "            # total can be divided by the dataset size.\n",
    "            running_loss += loss.item() * label.size(0)\n",
    "            _, pred = torch.max(out, 1)\n",
    "            running_acc += (pred == label).sum().item()\n",
    "            # Backward pass\n",
    "            optimizer.zero_grad()\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "        print('Finish {} epoch, Loss: {:.6f}, Acc: {:.6f}'.format(\n",
    "            epoch + 1, running_loss / n_train, running_acc / n_train))\n",
    "\n",
    "        model.eval()\n",
    "        eval_loss = 0\n",
    "        eval_acc = 0\n",
    "        # `volatile=True` no longer has any effect; no_grad() is what actually\n",
    "        # disables gradient tracking during evaluation.\n",
    "        with torch.no_grad():\n",
    "            for img, label in dataset.test_dataloader:\n",
    "                if USE_GPU:\n",
    "                    img = img.cuda()\n",
    "                    label = label.cuda()\n",
    "                out = model(img)\n",
    "                loss = criterion(out, label)\n",
    "                eval_loss += loss.item() * label.size(0)\n",
    "                _, pred = torch.max(out, 1)\n",
    "                eval_acc += (pred == label).sum().item()\n",
    "        print('Test Loss: {:.6f}, Acc: {:.6f}'.format(\n",
    "            eval_loss / n_test, eval_acc / n_test))\n",
    "        print()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1\n",
      "**********\n",
      "Finish 1 epoch, Loss: 0.208683, Acc: 0.954045\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:59: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
      "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:60: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Loss: 1.635921, Acc: 0.568327\n",
      "\n",
      "epoch 2\n",
      "**********\n",
      "Finish 2 epoch, Loss: 0.098167, Acc: 0.982132\n",
      "Test Loss: 0.154757, Acc: 0.988131\n",
      "\n",
      "epoch 3\n",
      "**********\n",
      "Finish 3 epoch, Loss: 0.050068, Acc: 0.987970\n",
      "Test Loss: 0.118267, Acc: 0.988482\n",
      "\n",
      "epoch 4\n",
      "**********\n",
      "Finish 4 epoch, Loss: 0.041440, Acc: 0.988893\n",
      "Test Loss: 0.099372, Acc: 0.989123\n",
      "\n",
      "epoch 5\n",
      "**********\n",
      "Finish 5 epoch, Loss: 0.038163, Acc: 0.989390\n",
      "Test Loss: 0.099686, Acc: 0.989002\n",
      "\n",
      "epoch 6\n",
      "**********\n",
      "Finish 6 epoch, Loss: 0.035749, Acc: 0.989891\n",
      "Test Loss: 0.082922, Acc: 0.989710\n",
      "\n",
      "epoch 7\n",
      "**********\n",
      "Finish 7 epoch, Loss: 0.033967, Acc: 0.990402\n",
      "Test Loss: 0.091113, Acc: 0.990682\n",
      "\n",
      "epoch 8\n",
      "**********\n",
      "Finish 8 epoch, Loss: 0.032350, Acc: 0.990891\n",
      "Test Loss: 0.081245, Acc: 0.991208\n",
      "\n",
      "epoch 9\n",
      "**********\n",
      "Finish 9 epoch, Loss: 0.030806, Acc: 0.991400\n",
      "Test Loss: 0.086804, Acc: 0.992349\n",
      "\n",
      "epoch 10\n",
      "**********\n",
      "Finish 10 epoch, Loss: 0.029359, Acc: 0.991764\n",
      "Test Loss: 0.073397, Acc: 0.991694\n",
      "\n",
      "epoch 11\n",
      "**********\n",
      "Finish 11 epoch, Loss: 0.028263, Acc: 0.991955\n",
      "Test Loss: 0.083497, Acc: 0.991654\n",
      "\n",
      "epoch 12\n",
      "**********\n",
      "Finish 12 epoch, Loss: 0.027390, Acc: 0.992158\n",
      "Test Loss: 0.063012, Acc: 0.991627\n",
      "\n",
      "epoch 13\n",
      "**********\n",
      "Finish 13 epoch, Loss: 0.026482, Acc: 0.992273\n",
      "Test Loss: 0.062718, Acc: 0.992301\n",
      "\n",
      "epoch 14\n",
      "**********\n",
      "Finish 14 epoch, Loss: 0.025874, Acc: 0.992502\n",
      "Test Loss: 0.068418, Acc: 0.993226\n",
      "\n",
      "epoch 15\n",
      "**********\n",
      "Finish 15 epoch, Loss: 0.025420, Acc: 0.992505\n",
      "Test Loss: 0.062197, Acc: 0.993415\n",
      "\n",
      "epoch 16\n",
      "**********\n",
      "Finish 16 epoch, Loss: 0.024918, Acc: 0.992701\n",
      "Test Loss: 0.060023, Acc: 0.992875\n",
      "\n",
      "epoch 17\n",
      "**********\n",
      "Finish 17 epoch, Loss: 0.025414, Acc: 0.992487\n",
      "Test Loss: 0.081598, Acc: 0.992585\n",
      "\n",
      "epoch 18\n",
      "**********\n",
      "Finish 18 epoch, Loss: 0.024409, Acc: 0.992707\n",
      "Test Loss: 0.075168, Acc: 0.992011\n",
      "\n",
      "epoch 19\n",
      "**********\n",
      "Finish 19 epoch, Loss: 0.023752, Acc: 0.992970\n",
      "Test Loss: 0.049177, Acc: 0.993765\n",
      "\n",
      "epoch 20\n",
      "**********\n",
      "Finish 20 epoch, Loss: 0.023343, Acc: 0.993097\n",
      "Test Loss: 0.059661, Acc: 0.993921\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "kddcup99 = datasets.fetch_kddcup99()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 b'tcp' b'http' b'SF' 219 643 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 6 6 0.0\n",
      " 0.0 0.0 0.0 1.0 0.0 0.0 44 255 1.0 0.0 0.02 0.03 0.0 0.0 0.0 0.0] b'normal.'\n"
     ]
    }
   ],
   "source": [
    "test_data = kddcup99.data[888]\n",
    "label = kddcup99.target[888]\n",
    "print(test_data, label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "_data = dataset.encode(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the 1x1x8x8 input on whichever device the model lives on, so the\n",
    "# cell also runs on CPU-only machines instead of requiring CUDA.\n",
    "__data = torch.from_numpy(\n",
    "    np.pad(_data, (0, 64 - len(_data)), 'constant').astype(np.float32)\n",
    ").reshape(-1, 1, 8, 8).to(next(model.parameters()).device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[   1.7615,   16.6235,   16.5394,   19.2315,   14.2301,   19.8753,\n",
       "            7.7450,   16.8384,   15.4089, -126.3723,    7.7885,   29.8984,\n",
       "           14.1022,   14.9759,    4.9610,  -57.6466,   14.1995,  -18.0559,\n",
       "          -23.1028,   15.2467,  -22.2353,   13.6960,   16.9799]], device='cuda:0')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model(__data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(data, multiple=False):\n",
    "    \"\"\"Classify one raw record with the global `model` and return its label.\n",
    "\n",
    "    Args:\n",
    "        data: a single raw record, as accepted by `dataset.encode`.\n",
    "        multiple: accepted for backward compatibility; currently unused.\n",
    "\n",
    "    Returns:\n",
    "        The decoded label of the highest-scoring class.\n",
    "    \"\"\"\n",
    "    _data = dataset.encode(data)\n",
    "    # Follow the model's device instead of hard-coding .cuda(), so prediction\n",
    "    # also works when the model was trained on CPU.\n",
    "    device = next(model.parameters()).device\n",
    "    _data = torch.from_numpy(\n",
    "        np.pad(_data, (0, 64 - len(_data)), 'constant').astype(np.float32)\n",
    "    ).reshape(-1, 1, 8, 8).to(device)\n",
    "    # Run inference in eval mode without gradient tracking, then restore\n",
    "    # whatever mode the model was in.\n",
    "    was_training = model.training\n",
    "    model.eval()\n",
    "    try:\n",
    "        with torch.no_grad():\n",
    "            _out = torch.max(model(_data), 1)[1].item()\n",
    "    finally:\n",
    "        model.train(was_training)\n",
    "    # The label encoder needs 1-d input, so wrap the scalar and unwrap the result.\n",
    "    return dataset.decode([_out], label=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "b'normal.'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predict(test_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
